# Transparent Corruption: The effect of illicit connections and trusted references 
# on the demand for bureaucratic intermediation

# Authors: Jose Ramon Morales-Arilla & Ana Gabriela Ibarra Luces

# SCRIPT 01 - DATA CLEANSING

# Packages ----------------------------------------------------------------

library(tidyverse)
library(haven)
library(fixest)
library(sjlabelled)
library(huxtable)
library(patchwork)
library(writexl)
library(xtable)
library(sjmisc)
library(summarytools)
library(tibble)

rm(list = ls())

# Functions  --------------------------------------------------------------

# Comparing NA values 
compareNA <- function(v1, v2) 
{
   same <- (v1 == v2) | (is.na(v1) & is.na(v2))
   same[is.na(same)] <- FALSE
   return(same)
}

# Read data --------------------------------------------------------------------

# Reading raw dataset 
survey_data <- read_sav("Data/Raw/Survey_Servicios_Burocraticos_October 5, 2022_13.17.sav")


# Data processing  --------------------------------------------------------

# Modify variable values
clean_survey <- survey_data %>% 
   dplyr::mutate(
                 
                 # Informed consent 
                 consent = ifelse(consent == 2, 0, consent), 
                 
                 # Outcome for exercise with no corruption suggestion,
                 # experienced gestor, low price and fast response time
                 y_no_corr_exp_lowp_q = case_when(no_corr_exp_lowp_q == 4 ~ 1,
                                                  no_corr_exp_lowp_q == 5 ~ 0, 
                                                  T ~ no_corr_exp_lowp_q),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # experienced gestor, high price and fast response time
                 y_no_corr_exp_hip_q = case_when(no_corr_exp_hip_q == 4 ~ 1,
                                                 no_corr_exp_hip_q == 5 ~ 0,
                                                 T ~ no_corr_exp_hip_q),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # experienced gestor, low price and slow response time
                 y_no_corr_exp_lowp_s = case_when(no_corr_exp_lowp_s == 4 ~ 1, 
                                                  no_corr_exp_lowp_s == 5 ~ 0, 
                                                  T ~ no_corr_exp_lowp_s),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # experienced gestor, high price and slow response time
                 y_no_corr_exp_hip_s = case_when(no_corr_exp_hip_s == 4 ~ 1,
                                                 no_corr_exp_hip_s == 5 ~ 0, 
                                                 T ~ no_corr_exp_hip_s),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # non-experienced gestor, low price and fast response time
                 y_no_corr_no_ex_lowp_q = case_when(no_corr_no_ex_lowp_q == 4 ~ 1,
                                                    no_corr_no_ex_lowp_q == 5 ~ 0, 
                                                    T ~ no_corr_no_ex_lowp_q),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # non-experienced gestor, high price and fast response time
                 y_no_corr_no_ex_hip_q = case_when(no_corr_no_ex_hip_q == 4 ~ 1, 
                                                   no_corr_no_ex_hip_q == 5 ~ 0, 
                                                   T ~  no_corr_no_ex_hip_q),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # non-experienced gestor, low price and slow response time
                 y_no_corr_no_ex_lowp_s = case_when(no_corr_no_ex_lowp_s == 4 ~ 1,
                                                    no_corr_no_ex_lowp_s == 5 ~ 0,
                                                    T ~ no_corr_no_ex_lowp_s),
                 
                 # Outcome for exercise with no corruption suggestion,
                 # non-experienced gestor, high price and slow response time
                 y_no_corr_no_ex_hip_s = case_when(no_corr_no_ex_hip_s == 4 ~ 1,
                                                   no_corr_no_ex_hip_s == 5 ~ 0,
                                                   T ~ no_corr_no_ex_hip_s),
                 
                 # Outcome for exercise with corruption suggestion,
                 # experienced gestor, low price and fast response time
                 y_corr_exp_lowp_q = case_when(corr_exp_lowp_q == 4 ~ 1, 
                                               corr_exp_lowp_q == 5 ~ 0, 
                                               T ~ corr_exp_lowp_q),
                 
                 # Outcome for exercise with corruption suggestion,
                 # experienced gestor, high price and fast response time
                 y_corr_exp_hip_q = case_when(corr_exp_hip_q == 4 ~ 1,
                                              corr_exp_hip_q == 5 ~ 0, 
                                              T ~ corr_exp_hip_q),
                 
                 # Outcome for exercise with corruption suggestion,
                 # experienced gestor, low price and slow response time
                 y_corr_exp_lowp_s = case_when(corr_exp_lowp_s == 4 ~ 1, 
                                               corr_exp_lowp_s == 5 ~ 0, 
                                               T ~ corr_exp_lowp_s),
                 
                 # Outcome for exercise with corruption suggestion,
                 # experienced gestor, high price and slow response time
                 y_corr_exp_hip_s = case_when(corr_exp_hip_s == 4 ~ 1, 
                                              corr_exp_hip_s == 5 ~ 0, 
                                              T ~ corr_exp_hip_s),
                 
                 # Outcome for exercise with corruption suggestion,
                 # non-experienced gestor, low price and fast response time
                 y_corr_no_exp_lowp_q = case_when(corr_no_exp_lowp_q == 4 ~ 1,
                                                  corr_no_exp_lowp_q == 5 ~ 0, 
                                                  T ~ corr_no_exp_lowp_q),
                 
                 # Outcome for exercise with corruption suggestion,
                 # non-experienced gestor, high price and fast response time
                 y_corr_no_exp_hip_q = case_when(corr_no_exp_hip_q == 4 ~ 1,
                                                 corr_no_exp_hip_q == 5 ~ 0,
                                                 T ~ corr_no_exp_hip_q),
                 
                 # Outcome for exercise with corruption suggestion,
                 # non-experienced gestor, low price and slow response time
                 y_corr_no_exp_lowp_s = case_when(corr_no_exp_lowp_s == 4 ~ 1, 
                                                  corr_no_exp_lowp_s == 5 ~ 0, 
                                                  T ~ corr_no_exp_lowp_s),
                 
                 # Outcome for exercise with corruption suggestion,
                 # non-experienced gestor, high price and slow response time
                 y_corr_no_exp_hip_s = case_when(corr_no_exp_hip_s == 4 ~ 1,
                                                 corr_no_exp_hip_s == 5 ~ 0,
                                                 T ~ corr_no_exp_hip_s),
                 
                 # Indicates whether the participant would have bargained the 
                 # price with the gestor
                 bargain = ifelse(bargain == 2, 0, bargain), 
                 
                 # Indicates whether the participant had previous thoughts 
                 # on migration or plans to migrate 
                 mig_thought = ifelse(mig_thought == 4, 0, mig_thought),
                 
                 # Indicates if the participant has used gestor services in the 
                 # past
                 prev_gest = case_when(prev_gest == 2 ~ 1,
                                       prev_gest == 3 ~ 0, 
                                       T ~ prev_gest),
                 
                 # Characterization of experience with gestores in the past: 
                 # Good, Bad, Neutral
                 gest_exp = case_when(gest_exp == 4 ~ 1, 
                                      gest_exp == 5 ~ 2,
                                      gest_exp == 6 ~ 3,
                                      T ~ gest_exp),
                 
                 # Academic ranking acording to GPA
                 acad_rank = case_when(acad_rank == 4 ~ 2,
                                       acad_rank == 5 ~ 3, 
                                       acad_rank== 6 ~ 4,
                                       acad_rank == 7 ~ 5, 
                                       T ~ acad_rank),
                 
                 # Income dependency of participant
                 income_source = case_when(income_source == 2 ~ 1,
                                           income_source == 3 ~ 0,
                                           T ~ income_source),
                 
                 # Monthly household income range of participant
                 m_house_income = case_when(m_house_income == 12 ~ 2,
                                            m_house_income == 13 ~ 3,
                                            m_house_income == 14 ~ 4,
                                            m_house_income == 15 ~ 5,
                                            m_house_income == 16 ~ 6,
                                            m_house_income == 17 ~ 7,
                                            m_house_income == 18 ~ 8, 
                                            m_house_income == 19 ~ 9,
                                            m_house_income == 20 ~ 10,
                                            m_house_income == 21 ~ 11,
                                            m_house_income == 22 ~ 12,
                                            m_house_income == 23 ~ 13,
                                            m_house_income == 24 ~ 14,
                                            T ~ m_house_income),
                 
                 # Survey channel 
                 survey_chann = case_when(survey_chann == 4 ~ 2,
                                          survey_chann == 5 ~ 3,
                                          survey_chann == 6 ~ 4, 
                                          T ~ survey_chann), 
                 
                 current_semest = as.numeric(current_semest))


# Modify variable labels 
clean_survey <- clean_survey %>% 
   dplyr::mutate(m_house_income = set_labels(m_house_income, labels = c('Menos de $23'= 1, 
                                                                 'Entre $23 - $50' = 2, 
                                                                 'Entre $50 - $100' = 3, 
                                                                 'Entre $100 - $150' = 4, 
                                                                 'Entre $150 - $200' = 5, 
                                                                 'Entre $200 - $250' = 6, 
                                                                 'Entre $250 - $ 300' = 7, 
                                                                 'Entre $300 - $350' = 8, 
                                                                 'Entre $350 - $400' = 9, 
                                                                 'Entre $400 - $450' = 10, 
                                                                 'Entre $450 - $500' = 11, 
                                                                 'Entre $500 - $550' = 12, 
                                                                 'Entre $550 - $600' = 13, 
                                                                 'Más de $600'= 14)),
                 
          survey_chann = set_labels(survey_chann, labels = c('Correo Institucional UCAB' = 1,
                                                             'Grupo WhatsApp' = 2, 
                                                             'Stand presencial en la UCAB' = 3, 
                                                             'Otro' = 4)),
          consent = set_labels(consent, labels = c('Sí' = 1, 
                                                   'No' = 0)), 
          
          y_no_corr_exp_lowp_q = set_labels(y_no_corr_exp_lowp_q, labels = c('Sí' = 1, 
                                                                             'No' = 0)),
          
          y_no_corr_exp_hip_q = set_labels(y_no_corr_exp_hip_q, labels = c('Sí' = 1, 
                                                                           'No' = 0)),
          
          y_no_corr_exp_lowp_s = set_labels(y_no_corr_exp_lowp_s, labels = c('Sí' = 1, 
                                                                             'No' = 0)),
          
          y_no_corr_exp_hip_s = set_labels(y_no_corr_exp_hip_s, labels = c('Sí' = 1, 
                                                                           'No' = 0)), 
          
          y_no_corr_no_ex_lowp_q = set_labels(y_no_corr_no_ex_lowp_q, labels = c('Sí' = 1, 
                                                                                 'No' = 0)), 
          
          y_no_corr_no_ex_hip_q = set_labels(y_no_corr_no_ex_hip_q, labels = c('Sí' = 1, 
                                                                               'No' = 0)), 
          
          y_no_corr_no_ex_lowp_s = set_labels(y_no_corr_no_ex_lowp_s, labels = c('Sí' = 1, 
                                                                                 'No' = 0)), 
          
          y_no_corr_no_ex_hip_s = set_labels(y_no_corr_no_ex_hip_s, labels = c('Sí' = 1, 
                                                                               'No' = 0)),
          
          y_corr_exp_lowp_q = set_labels(y_corr_exp_lowp_q, labels = c('Sí' = 1, 
                                                                       'No' = 0)), 
          
          y_corr_exp_hip_q = set_labels(y_corr_exp_hip_q, labels = c('Sí' = 1, 
                                                                     'No' = 0)),
          
          y_corr_exp_lowp_s = set_labels(y_corr_exp_lowp_s, labels = c('Sí' = 1, 
                                                                       'No' = 0)), 
          
          y_corr_no_exp_hip_s = set_labels(y_corr_no_exp_hip_s, labels = c('Sí' = 1, 
                                                                           'No' = 0)), 
          
          bargain = set_labels(bargain, labels = c('Sí' = 1, 
                                                   'No' = 0)), 
          
          mig_thought = set_labels(mig_thought, labels = c('Sí' = 1, 
                                                           'No' = 0)), 
          
          prev_gest = set_labels(prev_gest, labels = c('Sí' = 1, 
                                                       'No' = 0)), 
          
          gest_exp = set_labels(gest_exp, labels = c('Buena' = 1, 
                                                     'Mala' = 2, 
                                                     'Neutro' = 3)),
          
          acad_rank = set_labels(acad_rank, labels = c('En el top 5%' = 1, 
                                                       '5% - 20%' = 2,
                                                       '20% - 50%' = 3, 
                                                       '50% - 80%' = 4, 
                                                       
                                                       'Por debajo del 80%' = 5)), 
          
          income_source = set_labels(income_source, labels = c('Ingresos propios, soy económicamente independiente.' = 1, 
                                                               'Dependo económicamente de alguien' = 0)),
          
          univ_career = set_labels(univ_career, labels = c("Administración" = 1, 
                                                           "Contaduría" = 2, 
                                                           "Comunicación social" = 3, 
                                                           "Derecho" = 4, 
                                                           "Economía" = 5, 
                                                           "Filosofía" = 6, 
                                                           "Letras" = 7, 
                                                           "Ingeniería civil" = 8, 
                                                           "Ingeniería industrial" = 9, 
                                                           "Ingeniería informática" = 10, 
                                                           "Ingeniería en telecomunicaciones" = 11, 
                                                           "Psicología" = 12, 
                                                           "Teología" = 13, 
                                                           "Relaciones industriales" = 14, 
                                                           "Sociología" = 15, 
                                                           "Otro" = 16, 
                                                           "Arquitectura" = 17))) %>% 
   
   # Selecting variables of interest
   select(c(Finished, Progress, Q_RelevantIDDuplicateScore ,Duration__in_seconds_, consent, y_corr_exp_lowp_q, y_corr_exp_hip_q, y_corr_exp_lowp_s,
            y_corr_exp_hip_s, y_corr_no_exp_lowp_q, y_corr_no_exp_hip_q, y_corr_no_exp_lowp_s ,
            y_corr_no_exp_hip_s, y_no_corr_exp_lowp_q, y_no_corr_exp_hip_q, y_no_corr_exp_lowp_s,
            y_no_corr_exp_hip_s, y_no_corr_no_ex_lowp_q, y_no_corr_no_ex_hip_q, y_no_corr_no_ex_lowp_s,
            y_no_corr_no_ex_hip_s, willing_to_pay, bargain, mig_thought, prev_gest,gest_exp, gender, age, univ_career,
            univ_start_yr, current_semest, acad_rank, income_source, 
            m_house_income, survey_chann, contacto_situacion_DO_primo_, contacto_situacion_DO_google,
            tratamientos_main_DO_no_corr_exp_lowp_q,tratamientos_main_DO_no_corr_no_ex_hip_q, tratamientos_main_DO_corr_no_exp_hip_q,
            tratamientos_main_DO_corr_exp_hip_q, tratamientos_main_DO_no_corr_exp_hip_q,tratamientos_main_DO_corr_no_exp_lowp_s, tratamientos_main_DO_corr_exp_lowp_s,
            tratamientos_main_DO_no_corr_no_ex_lowp_s, tratamientos_main_DO_corr_no_exp_hip_s, tratamientos_main_DO_no_corr_exp_lowp_s, 
            tratamientos_main_DO_corr_exp_hip_s, tratamientos_main_DO_no_corr_no_ex_hip_s, tratamientos_main_DO_corr_no_exp_lowp_q,
            tratamientos_main_DO_corr_exp_lowp_q, tratamientos_main_DO_no_corr_exp_hip_s, tratamientos_main_DO_no_corr_no_ex_lowp_q))


# Creating treatment and outcome variables
clean_survey$Sug_Corr <- '-'
clean_survey$Exper <- '-'
clean_survey$Precio <- '-'
clean_survey$Rapidez <- '-'
clean_survey$GestLink <- '-'
clean_survey$Y <- '-'


# Assigning values to the treatment and outcome variables

clean_survey <- clean_survey %>%
   
   mutate(Sug_Corr = ifelse(compareNA(tratamientos_main_DO_corr_no_exp_hip_q,1) | 
                               compareNA(tratamientos_main_DO_corr_exp_hip_q, 1) | 
                               compareNA(tratamientos_main_DO_corr_no_exp_lowp_s, 1) |
                               compareNA(tratamientos_main_DO_corr_exp_lowp_s, 1) | 
                               compareNA(tratamientos_main_DO_corr_no_exp_hip_s, 1) | 
                               compareNA(tratamientos_main_DO_corr_exp_hip_s, 1) | 
                               compareNA(tratamientos_main_DO_corr_no_exp_lowp_q, 1) | 
                               compareNA(tratamientos_main_DO_corr_exp_lowp_q, 1), 
                            yes = 1, no = 0), 
          
          Exper = ifelse(compareNA(tratamientos_main_DO_corr_exp_hip_q, 1) | 
                            compareNA(tratamientos_main_DO_corr_exp_hip_s, 1) | 
                            compareNA(tratamientos_main_DO_corr_exp_lowp_q, 1) | 
                            compareNA(tratamientos_main_DO_corr_exp_lowp_s, 1) | 
                            compareNA(tratamientos_main_DO_no_corr_exp_hip_q, 1) | 
                            compareNA(tratamientos_main_DO_no_corr_exp_hip_s, 1) | 
                            compareNA(tratamientos_main_DO_no_corr_exp_lowp_q, 1) | 
                            compareNA(tratamientos_main_DO_no_corr_exp_lowp_s, 1), 
                         yes = 1, no = 0), 
          
          Precio = ifelse(compareNA(tratamientos_main_DO_corr_exp_lowp_q, 1) | 
                             compareNA(tratamientos_main_DO_corr_exp_lowp_s, 1) | 
                             compareNA(tratamientos_main_DO_corr_no_exp_lowp_q, 1) | 
                             compareNA(tratamientos_main_DO_corr_no_exp_lowp_s, 1) | 
                             compareNA(tratamientos_main_DO_no_corr_exp_lowp_q, 1) | 
                             compareNA(tratamientos_main_DO_no_corr_exp_lowp_s, 1) | 
                             compareNA(tratamientos_main_DO_no_corr_no_ex_lowp_q,  1) | 
                             compareNA(tratamientos_main_DO_no_corr_no_ex_lowp_s, 1), 
                          yes = 0, no = 1),
          
          Rapidez = ifelse(compareNA(tratamientos_main_DO_corr_exp_hip_q, 1) | 
                              compareNA(tratamientos_main_DO_corr_exp_lowp_q, 1) | 
                              compareNA(tratamientos_main_DO_corr_no_exp_hip_q, 1) | 
                              compareNA(tratamientos_main_DO_corr_no_exp_lowp_q, 1) | 
                              compareNA(tratamientos_main_DO_no_corr_exp_hip_q, 1) | 
                              compareNA(tratamientos_main_DO_no_corr_exp_lowp_q, 1) | 
                              compareNA(tratamientos_main_DO_no_corr_no_ex_hip_q, 1) | 
                              compareNA(tratamientos_main_DO_no_corr_no_ex_lowp_q, 1), 
                           yes =1, no= 0),
          
          GestLink = ifelse(compareNA(contacto_situacion_DO_primo_, 1), 1, 0),
          
          Y = rowSums(.[6:21], na.rm = T), 
          
          Sug_Corr = set_labels(Sug_Corr, labels = c('Corrupto' = 1,
                                                     'No Corrupto' = 0)),
          
          Exper = set_labels(Exper, labels = c('Experiencia' = 1,
                                               'No experiencia' = 0)), 
          
          Precio = set_labels(Precio, labels = c('Bajo' = 0,
                                                 'Alto' = 1)), 
          
          Rapidez = set_labels(Rapidez, labels = c('Rápido' = 1,
                                                   'Lento' = 0)),
          Y = set_labels(Y, labels = c('Sí' = 1,
                                       'No' = 0)), 
          
          GestLink = set_labels(GestLink, labels = c("Primo" = 1, 
                                                     "Google" = 0))
   ) 


# Final table for regression analysis 

tabla_survey <- clean_survey %>% 
   
   select(Finished, Progress, Q_RelevantIDDuplicateScore ,consent, 
          Y, Sug_Corr, Precio, Exper, Rapidez, GestLink, willing_to_pay, 
          bargain, mig_thought, prev_gest, gest_exp ,gender, 
          age, univ_career, univ_start_yr, current_semest, 
          acad_rank, income_source, m_house_income, survey_chann) %>% 
   
   transform(gender = as.factor(gender), 
             univ_career = as.factor(univ_career), 
             current_semest = as.factor(current_semest), 
             acad_rank = as.factor(acad_rank), 
             m_house_income = as.factor(m_house_income), 
             survey_chann = as.factor(survey_chann)) 


# EXPORT  -----------------------------------------------------------------

# EXCEL 
write_xlsx(tabla_survey, 'Data/Output/Dataset/01_dataset_transparent_corruption.xlsx')

# CSV 
write_csv(tabla_survey, 'Data/Output/Dataset/01_dataset_transparent_corruption.csv')

### ----------------------------------------------------------------------

rm(list = setdiff(ls(), 'tabla_survey'))
